# This code is for the results to the question: Is there stability across caregivers, regardless of activity?
library(tidyverse)
library(GGally)
library(ppcor)
library(psych)
library(Hmisc)
library(sjPlot)
# https://github.com/ggobi/ggally/issues/139
# Lower-panel function for GGally::ggpairs(): scatter points with a linear fit.
# `...` is forwarded to geom_smooth() (e.g. se, formula).
my_custom_smooth <- function(data, mapping, ...) {
  base <- ggplot(data = data, mapping = mapping)
  pts <- geom_point(alpha = .4, color = I("black"))
  fit <- geom_smooth(method = "lm", color = I("blue"), ...)
  base + pts + fit
}
# demographics
# Read one demographics CSV and harmonize its columns to id / hi / momed.
# `id_col`, `hi_col`, `momed_col` are the raw column names (strings), which
# differ between the English and Spanish files; `lang` labels the rows.
read_demo <- function(path, id_col, hi_col, momed_col, lang) {
  read_csv(path) %>%
    rename(id = all_of(id_col), hi = all_of(hi_col), momed = all_of(momed_col)) %>%
    dplyr::select(id, hi, momed) %>%
    mutate(id = as.character(id),
           language = lang)
}
demo_english <- read_demo("./data_demo_lena_transcripts/demo_english_ms.csv",
                          "ID", "HI24Final", "Momed", "english")
demo_spanish <- read_demo("./data_demo_lena_transcripts/demo_spanish_ms.csv",
                          "ID", "HI_18", "MomEd_25m", "spanish")
# bind_rows() matches columns by name (safer than rbind for tibbles)
demo <- bind_rows(demo_english, demo_spanish)
# NOTE about periods of non-tCDCS
# gemods refers to when there are designated start/end periods of other-directed speech (ODS); this was captured using gems (@G) using CHAT conventions
# kwalods refers to when ODS was transcribed at an utterance-level within a tCDS activity period between caregiver and child (e.g., other-directed speech in the background); this was captured per utterances using CHAT postcodes
## for tokens/min and types/min, we do not include ODS that occurred within a period of tCDS, because durations were captured by activity and not by utterance
## for mlu, we include all ODS across gemods and kwalods
# NOTE about speech == "all"
# "speech" includes two levels: all, spont
# all = refers to all speech by caregivers
# spont = refers to only speech by caregivers that was considered spontaneous rather than recited (e.g., reading book text, singing memorized common songs like itsy bitsy spider); therefore, 'spont' is a subset of 'all'
# freq
# Tokens/minute and types/minute for each speaker within each
# id x activity x recording-segment cell.
# kwalods rows are dropped because utterance-level ODS has no activity
# duration (see NOTE above); only speech == "all" is analyzed here.
freq <- read_csv("./data_demo_lena_transcripts/freq.csv") %>%
filter(activity != "kwalods",
speech == "all") %>%
mutate(activity = recode(activity, "play" = "cc", "conv" = "cc",
"food" = "cc", "routines" = "cc", "gemods" = "non_tcds")) %>% # we collapsed across play, routines, feeding, and unstructured conversation for the final paper [see supplemental for all categories]
# Within each cell: total duration and tokens are summed, types are averaged.
# NOTE(review): tokens2 uses sum() but types2 uses mean() -- presumably because
# type counts cannot be summed across rows without double-counting repeated
# word types; confirm this asymmetry is intended.
group_by(id, activity, segment_num, speaker) %>%
mutate(dur_min2 = sum(dur_min),
tokens2 = sum(tokens),
types2 = mean(types)) %>%
# mutate() + distinct() collapses each group to one row carrying the aggregates
distinct(id, activity, language, speaker, segment_num, tokens2, types2, dur_min2) %>%
mutate(tokens_permin2 = tokens2/dur_min2,
types_permin2 = types2/dur_min2) %>%
distinct(id, activity, language, speaker, segment_num, tokens_permin2, types_permin2) %>%
ungroup() %>%
# fix the activity order used by all downstream plots (books first)
mutate(activity = factor(activity, levels = c("books", "cc", "ac", "non_tcds")),
id = factor(id),
language = factor(language))
# mlu
# Mean length of utterance in words (MLUw) per speaker within each
# id x activity x recording-segment cell: total words / total utterances.
mlu <- read_csv("./data_demo_lena_transcripts/mlu.csv") %>%
  filter(speech == "all") %>%
  mutate(activity = recode(activity, "play" = "cc", "conv" = "cc",
                           "food" = "cc", "routines" = "cc", "ods" = "non_tcds")) %>%
  group_by(id, activity, segment_num, speaker) %>%
  mutate(words_sum2 = sum(words_sum),
         num_utt_sum2 = sum(num_utt_sum)) %>%
  distinct(id, activity, language, speaker, segment_num, words_sum2, num_utt_sum2) %>%
  # (a redundant group_by() with the same keys was removed here: distinct()
  # on a grouped tibble already preserves the grouping)
  mutate(mlu_w2 = words_sum2/num_utt_sum2) %>%
  distinct(id, activity, language, speaker, segment_num, mlu_w2) %>%
  ungroup() %>%
  mutate(activity = factor(activity, levels = c("books", "cc", "ac", "non_tcds")),
         id = factor(id),
         language = factor(language))
# chip
# this includes only caregivers, therefore there is no speaker column
# we exclude periods of ODS because this is about responsiveness to the child during periods of tCDS
# Caregiver responsiveness per id x activity x segment:
#   prop_adultresp2    = adult responses per child utterance
#   prop_adult_imitexp2 = adult imitations/expansions per child utterance
chip <- read_csv("./data_demo_lena_transcripts/chip.csv") %>%
filter(activity != "ods") %>%
mutate(activity = recode(activity, "play" = "cc", "conv" = "cc",
"food" = "cc", "routines" = "cc")) %>%
group_by(id, activity, segment_num) %>%
mutate(total_adult_resp2 = sum(total_adult_resp),
total_adult_imitexp2 = sum(total_adult_imitexp),
total_child_utt2 = sum(total_child_utt)) %>%
distinct(id, activity, language, segment_num, total_adult_resp2, total_adult_imitexp2, total_child_utt2) %>%
# NOTE(review): these "proportions" can exceed 1 (multiple adult responses per
# child utterance -- see the str() output below, e.g. 1.35, 2.14) and are
# undefined when total_child_utt2 == 0; confirm that is expected.
mutate(prop_adultresp2 = total_adult_resp2/total_child_utt2,
prop_adult_imitexp2 = total_adult_imitexp2/total_child_utt2) %>%
distinct(id, activity, segment_num, language, prop_adultresp2, prop_adult_imitexp2) %>%
ungroup() %>%
# chip has only the three tCDS activities (no non_tcds level)
mutate(activity = factor(activity, levels = c("books", "cc", "ac")),
id = factor(id),
language = factor(language))
# Sanity checks: confirm sizes, column types, and factor levels of the three
# analysis tables before aggregating (outputs below retained for the record).
str(freq)
## tibble[,7] [2,870 × 7] (S3: tbl_df/tbl/data.frame)
## $ id : Factor w/ 90 levels "7292","7352",..: 47 47 47 47 50 50 52 52 52 52 ...
## $ activity : Factor w/ 4 levels "books","cc","ac",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ speaker : chr [1:2870] "CHI" "ADULTS" "CHI" "ADULTS" ...
## $ segment_num : num [1:2870] 12 12 15 15 2 2 11 11 5 5 ...
## $ language : Factor w/ 2 levels "english","spanish": 1 1 1 1 1 1 1 1 1 1 ...
## $ tokens_permin2: num [1:2870] 8.46 42.57 5.32 21.75 12.31 ...
## $ types_permin2 : num [1:2870] 4.79 19.73 2.59 9.89 3.61 ...
str(mlu)
## tibble[,6] [2,594 × 6] (S3: tbl_df/tbl/data.frame)
## $ id : Factor w/ 90 levels "7292","7352",..: 46 46 46 46 46 46 46 46 46 46 ...
## $ activity : Factor w/ 4 levels "books","cc","ac",..: 3 3 2 2 4 4 3 3 2 2 ...
## $ speaker : chr [1:2594] "ADULTS" "CHI" "ADULTS" "CHI" ...
## $ segment_num: num [1:2594] 2 2 2 2 2 2 3 3 3 3 ...
## $ language : Factor w/ 2 levels "english","spanish": 1 1 1 1 1 1 1 1 1 1 ...
## $ mlu_w2 : num [1:2594] 3.18 1.89 2.84 1.73 5.5 ...
str(chip)
## tibble[,6] [899 × 6] (S3: tbl_df/tbl/data.frame)
## $ activity : Factor w/ 3 levels "books","cc","ac": 3 2 3 2 3 2 2 3 2 3 ...
## $ id : Factor w/ 90 levels "7292","7352",..: 46 46 46 46 46 46 46 46 46 46 ...
## $ language : Factor w/ 2 levels "english","spanish": 1 1 1 1 1 1 1 1 1 1 ...
## $ segment_num : num [1:899] 2 2 3 3 4 4 5 7 7 8 ...
## $ prop_adultresp2 : num [1:899] 1.35 1.57 1.43 1.65 2.14 ...
## $ prop_adult_imitexp2: num [1:899] 0.391 0.418 0.463 0.35 0.643 ...
# Caregiver-only subsets: all remaining analyses concern adult speech, so the
# child (CHI) rows are dropped here.
freq_adult <- filter(freq, speaker == "ADULTS")
mlu_adult <- filter(mlu, speaker == "ADULTS")
# tokens and types: average per activity
# One row per id x activity: the mean across that child's recording segments.
freq_adult_per_activity_id <- freq_adult %>%
group_by(id, activity) %>%
mutate(tokens_permin_avg_act = mean(tokens_permin2),
types_permin_avg_act = mean(types_permin2)) %>%
distinct(id, activity, language, tokens_permin_avg_act, types_permin_avg_act) %>%
ungroup() %>%
# re-assert the activity display order after aggregating
mutate(activity = factor(activity, levels = c("books", "cc", "ac", "non_tcds")))
# mlu: average per activity
mlu_adult_per_activity_id <- mlu_adult %>%
group_by(id, activity) %>%
mutate(mluw_avg_act = mean(mlu_w2)) %>%
distinct(id, activity, language, mluw_avg_act) %>%
ungroup() %>%
mutate(activity = factor(activity, levels = c("books", "cc", "ac", "non_tcds")))
# chip: average per activity
# One row per id x activity: mean responsiveness proportions across segments.
# na.rm = TRUE because some segment-level proportions are missing (the rcorr
# n-tables below show n < 90 for chip) -- presumably segments with no child
# utterances; confirm.
chip_per_activity_id <- chip %>%
  group_by(id, activity) %>%
  mutate(prop_adultresp_avg_act = mean(prop_adultresp2, na.rm = TRUE),
         prop_adult_imitexp_avg_act = mean(prop_adult_imitexp2, na.rm = TRUE)) %>%
  distinct(id, activity, language, prop_adultresp_avg_act, prop_adult_imitexp_avg_act) %>%
  ungroup()  # ungroup for consistency with the freq/mlu per-activity tables
# Wide "one row per child" matrices: spread the per-activity averages so each
# activity becomes a column, keeping `language` for the language-split plots.
# chip-based measures only have the three tCDS activities.
widen_by_activity <- function(df, value_col, activity_cols) {
  df %>%
    dplyr::select(id, language, activity, {{ value_col }}) %>%
    pivot_wider(names_from = activity, values_from = {{ value_col }}) %>%
    ungroup() %>%
    dplyr::select(all_of(c("language", activity_cols)))
}
four_acts <- c("books", "cc", "ac", "non_tcds")
three_acts <- c("books", "cc", "ac")
# tokens (all)
tokens_mtx_rate <- widen_by_activity(freq_adult_per_activity_id,
                                     tokens_permin_avg_act, four_acts)
# types (all)
types_mtx_rate <- widen_by_activity(freq_adult_per_activity_id,
                                    types_permin_avg_act, four_acts)
# mlu (all)
mlu_mtx <- widen_by_activity(mlu_adult_per_activity_id,
                             mluw_avg_act, four_acts)
# prop responses (all)
propresp_mtx <- widen_by_activity(chip_per_activity_id,
                                  prop_adultresp_avg_act, three_acts)
# prop imitations and expansions (all)
propimitexp_mtx <- widen_by_activity(chip_per_activity_id,
                                     prop_adult_imitexp_avg_act, three_acts)
# Child-level (zero-order) averages: collapse each child's activity averages
# to a single value per measure, so measures can be correlated across children.
# tokens and types rate
zero_corr_tokens_types <- freq_adult_per_activity_id %>%
  group_by(id) %>%
  mutate(tokens_permin_avg_id = mean(tokens_permin_avg_act),
         types_permin_avg_id = mean(types_permin_avg_act)) %>%
  distinct(id, language, tokens_permin_avg_id, types_permin_avg_id) %>%
  ungroup()
# mlu
zero_corr_mlu <- mlu_adult_per_activity_id %>%
  group_by(id) %>%
  mutate(mluw_mean_avg_id = mean(mluw_avg_act)) %>%
  distinct(id, language, mluw_mean_avg_id) %>%
  ungroup()
# chip
# na.rm = TRUE: some children lack a books-activity average (books n = 42)
zero_corr_chip <- chip_per_activity_id %>%
  group_by(id) %>%
  mutate(propresp_avg_id = mean(prop_adultresp_avg_act, na.rm = TRUE),
         propimitexp_avg_id = mean(prop_adult_imitexp_avg_act, na.rm = TRUE)) %>%
  distinct(id, language, propresp_avg_id, propimitexp_avg_id) %>%
  ungroup()
# combining tokens, types, mlu, and chip into one child-level table
zero_corr_all_measures <- zero_corr_tokens_types %>%
  full_join(zero_corr_mlu, by = c("id", "language")) %>%
  full_join(zero_corr_chip, by = c("id", "language"))
# Pairs matrix of the five child-level measures (columns 3:7 of the combined
# table: tokens rate, types rate, MLUw, prop responses, prop imit/exp).
ggpairs(zero_corr_all_measures,
        columns = 3:7,
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 9)),
        title = "zero order corr (avg'd across all activities) - tokens, types, mlu") +
  theme(text = element_text(size = 18))
# correlation matrices between verbal engagement measures
# all
# Drop the identifier columns so every remaining column enters the matrix.
zero_corr_all_measures2 <- zero_corr_all_measures %>%
  ungroup() %>%  # was bare `ungroup` (only works inside %>%); parens added
  dplyr::select(-c(id, language))
rcorr(as.matrix(zero_corr_all_measures2), type = "pearson")
## tokens_permin_avg_id types_permin_avg_id mluw_mean_avg_id
## tokens_permin_avg_id 1.00 0.89 0.62
## types_permin_avg_id 0.89 1.00 0.42
## mluw_mean_avg_id 0.62 0.42 1.00
## propresp_avg_id 0.42 0.30 0.19
## propimitexp_avg_id 0.24 0.16 0.20
## propresp_avg_id propimitexp_avg_id
## tokens_permin_avg_id 0.42 0.24
## types_permin_avg_id 0.30 0.16
## mluw_mean_avg_id 0.19 0.20
## propresp_avg_id 1.00 0.42
## propimitexp_avg_id 0.42 1.00
##
## n= 90
##
##
## P
## tokens_permin_avg_id types_permin_avg_id mluw_mean_avg_id
## tokens_permin_avg_id 0.0000 0.0000
## types_permin_avg_id 0.0000 0.0000
## mluw_mean_avg_id 0.0000 0.0000
## propresp_avg_id 0.0000 0.0036 0.0769
## propimitexp_avg_id 0.0249 0.1218 0.0613
## propresp_avg_id propimitexp_avg_id
## tokens_permin_avg_id 0.0000 0.0249
## types_permin_avg_id 0.0036 0.1218
## mluw_mean_avg_id 0.0769 0.0613
## propresp_avg_id 0.0000
## propimitexp_avg_id 0.0000
# Tokens/min: pairwise scatter matrix across the four activities.
ggpairs(data = tokens_mtx_rate,
        columns = 2:5,
        switch = "y",
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 7)),
        title = "Tokens rate (English and Spanish)") +
  theme_classic() +
  theme(text = element_text(size = 18),
        strip.placement = "outside",
        strip.text.y = element_text(face = "bold", size = 15),
        strip.text.x = element_text(face = "bold", size = 15))
# Same matrix after dropping the one child with an extreme adult-centered rate.
tokens_mtx_rate2 <- filter(tokens_mtx_rate, ac < 530)
ggpairs(data = tokens_mtx_rate2,
        columns = 2:5,
        switch = "y",
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 7)),
        title = "Rm 1 outlier - Tokens rate (English and Spanish)") +
  theme_classic() +
  theme(text = element_text(size = 18),
        strip.placement = "outside",
        strip.text.y = element_text(face = "bold", size = 15),
        strip.text.x = element_text(face = "bold", size = 15))
# correlation matrices
tokens_mtx_rate_clean <- dplyr::select(tokens_mtx_rate, -language)
rcorr(as.matrix(tokens_mtx_rate_clean), type = "pearson")
## books cc ac non_tcds
## books 1.00 0.65 0.29 0.34
## cc 0.65 1.00 0.26 0.51
## ac 0.29 0.26 1.00 0.24
## non_tcds 0.34 0.51 0.24 1.00
##
## n
## books cc ac non_tcds
## books 42 42 42 42
## cc 42 90 90 90
## ac 42 90 90 90
## non_tcds 42 90 90 90
##
## P
## books cc ac non_tcds
## books 0.0000 0.0592 0.0276
## cc 0.0000 0.0121 0.0000
## ac 0.0592 0.0121 0.0238
## non_tcds 0.0276 0.0000 0.0238
# Types/min: pairwise scatter matrix across the four activities.
ggpairs(data = types_mtx_rate,
        columns = 2:5,
        switch = "y",
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 7)),
        title = "Types rate (English and Spanish)") +
  theme_classic() +
  theme(text = element_text(size = 18),
        strip.placement = "outside",
        strip.text.y = element_text(face = "bold", size = 15),
        strip.text.x = element_text(face = "bold", size = 15))
# ggsave("./figures/corr_mtx_types_n90_paper.pdf", width = 11, height = 8, dpi = 300)
# NOTE(review): the ac < 530 cutoff matches the tokens outlier threshold; for
# types/min (values ~5-20 in str() above) it likely removes nothing -- confirm.
types_mtx_rate2 <- filter(types_mtx_rate, ac < 530)
ggpairs(data = types_mtx_rate2,
        columns = 2:5,
        switch = "y",
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 7)),
        title = "Rm 1 outlier - Types rate (English and Spanish)") +
  theme_classic() +
  theme(text = element_text(size = 18),
        strip.placement = "outside",
        strip.text.y = element_text(face = "bold", size = 15),
        strip.text.x = element_text(face = "bold", size = 15))
# correlation matrices
types_mtx_rate_clean <- dplyr::select(types_mtx_rate, -language)
rcorr(as.matrix(types_mtx_rate_clean), type = "pearson")
## books cc ac non_tcds
## books 1.00 0.23 0.13 0.00
## cc 0.23 1.00 0.08 0.22
## ac 0.13 0.08 1.00 0.19
## non_tcds 0.00 0.22 0.19 1.00
##
## n
## books cc ac non_tcds
## books 42 42 42 42
## cc 42 90 90 90
## ac 42 90 90 90
## non_tcds 42 90 90 90
##
## P
## books cc ac non_tcds
## books 0.1377 0.3941 0.9904
## cc 0.1377 0.4523 0.0354
## ac 0.3941 0.4523 0.0738
## non_tcds 0.9904 0.0354 0.0738
# MLUw matrix with publication-ready activity labels (figure saved to disk).
mlu_mtx2 <- mlu_mtx %>%
  rename(Books = books, `Other Child-cent` = cc,
         `Adult-cent` = ac, `non-tCDS` = non_tcds)
ggpairs(data = mlu_mtx2,
        columns = 2:5,
        switch = "y",
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 11)),
        title = "MLUw (English and Spanish)") +
  theme_classic() +
  theme(text = element_text(size = 26),
        strip.placement = "outside",
        strip.text.y = element_text(face = "bold", size = 20),
        strip.text.x = element_text(face = "bold", size = 20))
ggsave("./figures/corr_mtx_mlu_n90_paper2.pdf", width = 15, height = 12, dpi = 300)
# correlation matrices
mlu_mtx_clean <- dplyr::select(mlu_mtx, -language)
rcorr(as.matrix(mlu_mtx_clean), type = "pearson")
## books cc ac non_tcds
## books 1.00 0.67 0.41 0.26
## cc 0.67 1.00 0.69 0.54
## ac 0.41 0.69 1.00 0.44
## non_tcds 0.26 0.54 0.44 1.00
##
## n
## books cc ac non_tcds
## books 42 42 42 42
## cc 42 90 90 90
## ac 42 90 90 90
## non_tcds 42 90 90 90
##
## P
## books cc ac non_tcds
## books 0.0000 0.0066 0.0918
## cc 0.0000 0.0000 0.0000
## ac 0.0066 0.0000 0.0000
## non_tcds 0.0918 0.0000 0.0000
# Proportion of adult responses: only the three tCDS activities exist for chip.
ggpairs(data = propresp_mtx,
        columns = 2:4,
        switch = "y",
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 7)),
        title = "Prop Resp (English and Spanish)") +
  theme_classic() +
  theme(text = element_text(size = 18),
        strip.placement = "outside",
        strip.text.y = element_text(face = "bold", size = 15),
        strip.text.x = element_text(face = "bold", size = 15))
# ggsave("./figures/corr_mtx_propresp_n90_paper.pdf", width = 11, height = 8, dpi = 300)
# correlation matrices
propresp_mtx_clean <- dplyr::select(propresp_mtx, -language)
rcorr(as.matrix(propresp_mtx_clean), type = "pearson")
## books cc ac
## books 1.00 0.36 0.21
## cc 0.36 1.00 0.27
## ac 0.21 0.27 1.00
##
## n
## books cc ac
## books 42 42 39
## cc 42 89 86
## ac 39 86 87
##
## P
## books cc ac
## books 0.0202 0.1959
## cc 0.0202 0.0114
## ac 0.1959 0.0114
# Proportion of adult imitations/expansions across the three tCDS activities.
ggpairs(data = propimitexp_mtx,
        columns = 2:4,
        switch = "y",
        lower = list(continuous = my_custom_smooth),
        upper = list(continuous = wrap("cor", size = 7)),
        title = "Prop Imit/Exp (English and Spanish)") +
  theme_classic() +
  theme(text = element_text(size = 18),
        strip.placement = "outside",
        strip.text.y = element_text(face = "bold", size = 15),
        strip.text.x = element_text(face = "bold", size = 15))
# ggsave("./figures/corr_mtx_propimitexp_n90_paper.pdf", width = 11, height = 8, dpi = 300)
# correlation matrices
propimitexp_mtx_clean <- dplyr::select(propimitexp_mtx, -language)
rcorr(as.matrix(propimitexp_mtx_clean), type = "pearson")
## books cc ac
## books 1.00 0.32 0.02
## cc 0.32 1.00 0.31
## ac 0.02 0.31 1.00
##
## n
## books cc ac
## books 42 42 39
## cc 42 89 86
## ac 39 86 87
##
## P
## books cc ac
## books 0.0400 0.9082
## cc 0.0400 0.0043
## ac 0.9082 0.0043
# create dfs with correlations
# psych::corr.test() is used (instead of rcorr above) because its $ci slot
# carries confidence intervals for each pair, needed for the forest plot.
tokens_corr <- psych::corr.test(tokens_mtx_rate_clean)
types_corr <- psych::corr.test(types_mtx_rate_clean)
mlu_corr <- psych::corr.test(mlu_mtx_clean)
resp_corr <- psych::corr.test(propresp_mtx_clean)
imitexp_corr <- psych::corr.test(propimitexp_mtx_clean)
# get lower, r, upper, and p; lower and upper
# corr.test()$ci rownames look like "books-cc", with long variable names
# abbreviated by psych (e.g. "non_tcds" -> "nn_tc"). Split each rowname into
# x/y columns and undo the abbreviation. One shared helper replaces the five
# copy-pasted blocks in the original.
ci_to_long <- function(corr_obj, measure_label) {
  tibble::rownames_to_column(round(corr_obj$ci, 3), "activity") %>%
    mutate(x = word(activity, 1, sep = "-"),
           y = word(activity, 2, sep = "-"),
           measure = measure_label,
           # undo psych's abbreviation of "non_tcds"; this recode is a no-op
           # for the 3-activity chip measures (no non_tcds column there)
           y = recode(y, "nn_tc" = "non_tcds")) %>%
    dplyr::select(-activity)
}
tokens_rcip <- ci_to_long(tokens_corr, "tokens_rate")
types_rcip <- ci_to_long(types_corr, "types_rate")
mluw_rcip <- ci_to_long(mlu_corr, "mluw")
resp_rcip <- ci_to_long(resp_corr, "prop_resp")
imitexp_rcip <- ci_to_long(imitexp_corr, "prop_imit_exp")
# Stack the five measures into one long table for the forest plot, with
# display labels and a fixed facet/legend order.
activity_labels <- c("books" = "Books", "cc" = "Other Child-cent",
                     "ac" = "Adult-cent", "non_tcds" = "non-tCDS")
activity_levels <- unname(activity_labels)
df_forest <- rbind(tokens_rcip, types_rcip, mluw_rcip, resp_rcip, imitexp_rcip) %>%
  mutate(x = recode(x, !!!activity_labels),
         y = recode(y, !!!activity_labels)) %>%
  mutate(x = factor(x, levels = activity_levels),
         y = factor(y, levels = activity_levels),
         measure = factor(measure, levels = c("tokens_rate", "types_rate", "mluw",
                                              "prop_resp", "prop_imit_exp"))) %>%
  as_tibble()
# forest plot
# https://datascienceplus.com/lattice-like-forest-plot-using-ggplot2-in-r/
# r with its CI for every activity pair (facet grid) and measure (shape/color).
ggplot(df_forest,
aes(x = measure,y = r, ymin = lower, ymax = upper))+
scale_colour_gradient(low = "green", high = "red") +
geom_pointrange(aes(shape = measure, color = r), size = 1) +
# shades the region from r = .3 upward (ymin = Inf, ymax = .3); because the
# rect aesthetics sit inside aes(), one rectangle is drawn per data row,
# hence the very low alpha -- NOTE(review): confirm this overplot is intended
geom_rect(aes(xmin = 0, xmax = Inf, ymin = Inf, ymax = .3),
fill = "grey", alpha = 0.08) +
# geom_hline(yintercept = 0, linetype = 1) +
# dashed reference line at r = 0
geom_hline(yintercept = 0, linetype = 2) +
xlab('Activity')+ ylab("\nPearson's r and Confidence Interval") +
facet_grid(x ~ y, switch = "y") +
theme_classic() +
theme(plot.title = element_text(size = 16,face = "bold"),
strip.placement = "outside",
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title = element_text(size = 26),
axis.title.y = element_text(vjust = 3),
axis.text.y = element_text(size = 20),
strip.text.y = element_text(face = "bold", size = 25),
strip.text.x = element_text(face = "bold", size = 25)) +
theme(legend.position = c(0.1, 0.35),
legend.key.size = unit(1.5, 'cm'),
legend.text=element_text(size = 16),
legend.title = element_text(size = 16))
ggsave("./figures/eff_sizes_4act2.pdf", width = 15, height = 12, dpi = 300)
# Choosing books and play since this has the most consistent positive and moderate - strong correlations across all input measures
# books ~ cc
# Does a caregiver's tokens/min during book reading track their rate during
# other child-centered activities? Nested comparison: +language, +interaction.
lm_tokens_rate1 <- lm(books ~ cc, data = tokens_mtx_rate)
lm_tokens_rate2 <- lm(books ~ cc + language, data = tokens_mtx_rate)
lm_tokens_rate3 <- lm(books ~ cc * language, data = tokens_mtx_rate)
# adding language improves fit (p = .008); the interaction does not
anova(lm_tokens_rate1, lm_tokens_rate2, lm_tokens_rate3)
## Analysis of Variance Table
##
## Model 1: books ~ cc
## Model 2: books ~ cc + language
## Model 3: books ~ cc * language
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 40 18310
## 2 39 15190 1 3119.40 7.8119 0.008092 **
## 3 38 15174 1 16.57 0.0415 0.839669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary
summary(lm_tokens_rate2)
##
## Call:
## lm(formula = books ~ cc + language, data = tokens_mtx_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.107 -11.667 -1.544 12.003 52.206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9838 10.9859 4.550 5.12e-05 ***
## cc 0.6769 0.1459 4.639 3.89e-05 ***
## languagespanish -18.1471 6.4125 -2.830 0.00732 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.74 on 39 degrees of freedom
## (48 observations deleted due to missingness)
## Multiple R-squared: 0.5164, Adjusted R-squared: 0.4916
## F-statistic: 20.82 on 2 and 39 DF, p-value: 7.044e-07
# diag
# NOTE(review): this plots the cc-only model while the summary above reports
# the model with language -- confirm which model the figure should show
plot_model(lm_tokens_rate1, type = "pred", terms = c("cc"),
           show.data = TRUE,
           dot.size = 4)
# books ~ cc
# Same nested-model comparison for types/min.
lm_types_rate1 <- lm(books ~ cc, data = types_mtx_rate)
lm_types_rate2 <- lm(books ~ cc + language, data = types_mtx_rate)
lm_types_rate3 <- lm(books ~ cc * language, data = types_mtx_rate)
# language is marginal (p = .05); interaction adds nothing
anova(lm_types_rate1, lm_types_rate2, lm_types_rate3)
## Analysis of Variance Table
##
## Model 1: books ~ cc
## Model 2: books ~ cc + language
## Model 3: books ~ cc * language
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 40 3937.8
## 2 39 3556.1 1 381.67 4.0798 0.05049 .
## 3 38 3554.9 1 1.19 0.0127 0.91082
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary
summary(lm_types_rate1)
##
## Call:
## lm(formula = books ~ cc, data = types_mtx_rate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.170 -7.572 -1.044 4.320 21.398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.0053 3.5119 6.836 3.18e-08 ***
## cc 0.2017 0.1332 1.515 0.138
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.922 on 40 degrees of freedom
## (48 observations deleted due to missingness)
## Multiple R-squared: 0.05425, Adjusted R-squared: 0.0306
## F-statistic: 2.294 on 1 and 40 DF, p-value: 0.1377
# diag
plot_model(lm_types_rate1, type = "pred", terms = c("cc"),
           show.data = TRUE,
           dot.size = 4)
# books ~ cc
# Same nested-model comparison for MLUw.
lm_mluw1 <- lm(books ~ cc, data = mlu_mtx)
lm_mluw2 <- lm(books ~ cc + language, data = mlu_mtx)
lm_mluw3 <- lm(books ~ cc * language, data = mlu_mtx)
# neither language nor the interaction improves fit for MLUw
anova(lm_mluw1, lm_mluw2, lm_mluw3)
## Analysis of Variance Table
##
## Model 1: books ~ cc
## Model 2: books ~ cc + language
## Model 3: books ~ cc * language
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 40 41.749
## 2 39 41.542 1 0.20726 0.1965 0.6600
## 3 38 40.071 1 1.47103 1.3950 0.2449
# summary
summary(lm_mluw1)
##
## Call:
## lm(formula = books ~ cc, data = mlu_mtx)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.89878 -0.81209 -0.00908 0.55754 2.81095
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5193 0.8698 -0.597 0.554
## cc 1.3967 0.2445 5.711 1.2e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.022 on 40 degrees of freedom
## (48 observations deleted due to missingness)
## Multiple R-squared: 0.4492, Adjusted R-squared: 0.4354
## F-statistic: 32.62 on 1 and 40 DF, p-value: 1.202e-06
# diag
plot_model(lm_mluw1, type = "pred", terms = c("cc"),
           show.data = TRUE,
           dot.size = 4)
# books ~ cc
# Same nested-model comparison for the proportion of adult responses.
lm_propresp1 <- lm(books ~ cc, data = propresp_mtx)
lm_propresp2 <- lm(books ~ cc + language, data = propresp_mtx)
lm_propresp3 <- lm(books ~ cc * language, data = propresp_mtx)
# neither language nor the interaction improves fit
anova(lm_propresp1, lm_propresp2, lm_propresp3)
## Analysis of Variance Table
##
## Model 1: books ~ cc
## Model 2: books ~ cc + language
## Model 3: books ~ cc * language
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 40 15.508
## 2 39 14.947 1 0.56180 1.4527 0.2355
## 3 38 14.695 1 0.25115 0.6494 0.4253
# summary
summary(lm_propresp1)
##
## Call:
## lm(formula = books ~ cc, data = propresp_mtx)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15412 -0.33585 -0.00945 0.32105 1.89345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1820 0.4551 2.598 0.0131 *
## cc 0.5838 0.2414 2.418 0.0202 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6227 on 40 degrees of freedom
## (48 observations deleted due to missingness)
## Multiple R-squared: 0.1275, Adjusted R-squared: 0.1057
## F-statistic: 5.847 on 1 and 40 DF, p-value: 0.02025
# diag
# NOTE(review): the summary above reports model 1 (cc only) but the plot shows
# model 2 (with language) -- the other sections plot the summarized model;
# confirm which is intended here
plot_model(lm_propresp2, type = "pred", terms = c("cc"),
           show.data = TRUE,
           dot.size = 4)
# books ~ cc
# Same nested-model comparison for the proportion of imitations/expansions.
lm_propimitexp1 <- lm(books ~ cc, data = propimitexp_mtx)
lm_propimitexp2 <- lm(books ~ cc + language, data = propimitexp_mtx)
lm_propimitexp3 <- lm(books ~ cc * language, data = propimitexp_mtx)
# neither language nor the interaction improves fit
anova(lm_propimitexp1, lm_propimitexp2, lm_propimitexp3)
## Analysis of Variance Table
##
## Model 1: books ~ cc
## Model 2: books ~ cc + language
## Model 3: books ~ cc * language
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 40 1.8309
## 2 39 1.7641 1 0.066763 1.4407 0.2375
## 3 38 1.7610 1 0.003116 0.0672 0.7968
# summary
summary(lm_propimitexp1)
##
## Call:
## lm(formula = books ~ cc, data = propimitexp_mtx)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32514 -0.18637 -0.01946 0.12972 0.64645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.27359 0.08749 3.127 0.00328 **
## cc 0.43258 0.20376 2.123 0.03999 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2139 on 40 degrees of freedom
## (48 observations deleted due to missingness)
## Multiple R-squared: 0.1013, Adjusted R-squared: 0.0788
## F-statistic: 4.507 on 1 and 40 DF, p-value: 0.03999
# diag
plot_model(lm_propimitexp1, type = "pred", terms = c("cc"),
           show.data = TRUE,
           dot.size = 4)